{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 设置基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "阿里研究院\n",
    "阿里健康\n",
    "阿里巴巴商学院\n",
    "阿里数据\n",
    "\n",
    "腾讯金融科技\n",
    "腾讯研究院\n",
    "腾讯媒体研究院\n",
    "腾讯云启研究院\n",
    "酷鹅用户研究院\n",
    "'''\n",
    "公众号 = \"概率论\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path templates for the files this notebook writes. The \"{公众号}\" placeholder\n",
    "# is stored literally here; nothing is substituted in this cell.\n",
    "fn = {\n",
    "    \"output\": {\n",
    "        \"公众号_htm_snippets\": \"data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "        \"公众号_df\": \"data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "        \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "from getpass import getpass\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lxml.html import fromstring"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用selenium 进入微信公众号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coding=utf-8\n",
    "from selenium import webdriver\n",
    "import time\n",
    "\n",
    "# Smoke test: check that Selenium can drive Chrome before using the real site.\n",
    "wd = webdriver.Chrome()\n",
    "try:\n",
    "    wd.get(\"https://www.baidu.com\")    # open the Baidu home page\n",
    "    wd.find_element_by_id(\"kw\").send_keys(\"selenium\")   # type a keyword into the search box\n",
    "    wd.find_element_by_id(\"su\").click()   # click the search button\n",
    "    time.sleep(3)   # wait 3 seconds\n",
    "finally:\n",
    "    wd.quit()   # always close the browser, even when a step above raises"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the \"DevToolsActivePort file doesn't exist\" error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser window size; Chrome expects width,height\n",
    "opts.add_argument('--disable-gpu')  # flag the Google docs mention as a bug workaround\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "# opts.add_argument('blink-settings=imagesEnabled=false')  # skip loading images for speed\n",
    "# opts.add_argument('--headless')  # no visible window; needed on Linux hosts without a display\n",
    "\n",
    "# `options=` is the supported keyword; `chrome_options=` is deprecated.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 自动化登录（需扫码）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 自动化账号密码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Credentials come from the environment (or an interactive prompt) so they are\n",
    "# never stored in the notebook file.\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"password: \"),\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch the login form to account/password mode.\n",
    "driver.find_element_by_xpath('//div[@class=\"login__type__container login__type__container__scan\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 清空账号input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 清空密码input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理验证码(若没弹出省略此步骤)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://mp.weixin.qq.com/cgi-bin/verifycode?username=vicky-newmedia@qq.com&r=1621472745304'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "verifycode_image_url = driver.find_element_by_xpath('//form[@class=\"login_form\"]//div[@class=\"verifycode\"]/img').get_attribute(\"src\")\n",
    "verifycode_image_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Baidu API: obtain an OAuth access token\n",
    "import requests\n",
    "\n",
    "# client_id is the API key (AK) and client_secret the secret key (SK) issued by\n",
    "# Baidu; both are read from the environment so they are not stored in the notebook.\n",
    "host = 'https://aip.baidubce.com/oauth/2.0/token'\n",
    "response = requests.get(\n",
    "    host,\n",
    "    params={\n",
    "        'grant_type': 'client_credentials',\n",
    "        'client_id': os.environ['BAIDU_OCR_API_KEY'],\n",
    "        'client_secret': os.environ['BAIDU_OCR_SECRET_KEY'],\n",
    "    },\n",
    "    timeout=30,\n",
    ")\n",
    "response.raise_for_status()  # fail here instead of leaving access_token undefined\n",
    "access_token = response.json()[\"access_token\"]\n",
    "# Only confirm that a token was obtained; displaying it would save it in the outputs.\n",
    "bool(access_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Response [200]>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Send the captcha image URL to Baidu's accurate_basic OCR endpoint.\n",
    "# NOTE(review): Baidu presumably downloads this URL without the browser's session\n",
    "# cookies, so it may read a different captcha than the one shown - confirm.\n",
    "request_url = \"https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic\"\n",
    "params = {\n",
    "    \"url\":verifycode_image_url\n",
    "}\n",
    "request_url = request_url + \"?access_token=\" + access_token\n",
    "headers = {'content-type': 'application/x-www-form-urlencoded'}\n",
    "# A timeout keeps the notebook from hanging forever on a stalled connection.\n",
    "response = requests.post(request_url, data=params, headers=headers, timeout=30)\n",
    "response.raise_for_status()  # surface HTTP failures here rather than in the parsing cell\n",
    "response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'T'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take the first recognised text line; fail with a readable message when the\n",
    "# OCR response contains no text instead of raising a bare KeyError/IndexError.\n",
    "ocr_result = response.json()\n",
    "words_result = ocr_result.get(\"words_result\") or []\n",
    "if not words_result:\n",
    "    raise ValueError(\"OCR returned no text: {!r}\".format(ocr_result))\n",
    "verifycode = words_result[0][\"words\"]\n",
    "verifycode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//div[@class=\"verifycode\"]//input').send_keys(verifycode)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 找选单"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'展开'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//a[@id=\"m_open\"]')\n",
    "element.click()\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击图文素材"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://mp.weixin.qq.com/cgi-bin/appmsg?begin=0&count=10&type=10&action=list_card&token=1219283422&lang=zh_CN'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Find the menu link whose <li> title contains \"图文素材\" and read its href.\n",
    "element = driver.find_element_by_xpath('//li[@title[contains(.,\"图文素材\")]]/a')\n",
    "url2 = element.get_attribute(\"href\")\n",
    "url2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(url2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 点击+\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[1]/i')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 新建图文消息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 新建图文消息\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[2]/ul/li[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-136369A3E353C13EBE242C95DC65887B', 'CDwindow-C9DBF5D511C3EC139EEE0284242B4F87']\n"
     ]
    }
   ],
   "source": [
    "# 检查窗口信息\n",
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 新建图文消息开了另一分视窗，所以要切换 switch_to \n",
    "driver.switch_to.window(driver.window_handles[-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 超链接 在微信公众平台寻找指定的公众号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "超链接\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"超链接\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "选择其他公众号\n"
     ]
    }
   ],
   "source": [
    "# 点 选择其他公众号\n",
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"选择其他公众号\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look the account search box up once, then clear it and type the account name.\n",
    "search_input = driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]')\n",
    "search_input.clear()\n",
    "search_input.send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# 点放大镜搜\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Dje3o9MhibXgOfTEQZnf0RQZrIwV1o0jRrLfjd8ibdKbceY0DSmRSicdshZu0JF25jg5qcqB1jZJJiajQ5zUiaJn8LQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">概率论</strong> <i class=\"inner_link_account_wechat\">微信号：ilovexiaogai</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/5EUcOMTqUyQKcuicWNmicyOPiaaicL4eoiboHaTVmlzIQ9O3NZ6p0xwkpY1ztPKyQ3t41SibkNqY9PwL4S5XFwuhhW9w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">概率论社交实验站</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/sz_mmbiz_png/FWjW9eJmCwvMtTu9THsYfLYsIZUibFfwicXtwFnFvYSko4bOYXfISnxLJzKCESf6UoRHaz3OUYaJUeWzh3Xo7jtg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">微积分与概率论</strong> <i class=\"inner_link_account_wechat\">微信号：mathtsing</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/iczoVwfI5GiaPt1w4NmhbTwFvDfKicR4perIg7XBmMFmTSviabcsGVOnJQcSPhTN8oItriamGmLKnayjjtOtnKrLUdw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">余丙森概率论</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/eQoM8wR3Nibx66TK9JsmxdKnWR3gcmzeyYZXXpZ9fpwPezxRyqicaSDia7pZ3B05vfYdcdMDYVgWZUXtlibiamLnwtg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">排列五概率论与数理统计</strong> <i class=\"inner_link_account_wechat\">微信号：qxctw2236350405</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One record per search-result <li>: nickname text, WeChat-id text and avatar URL.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "account_list = [\n",
    "    {\n",
    "        \"nickname\": item.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text,\n",
    "        \"wechat\": item.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text,\n",
    "        \"img\": item.xpath('./div/img/@src')[0],\n",
    "    }\n",
    "    for item in 主\n",
    "]\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>概率论</td>\n",
       "      <td>微信号：ilovexiaogai</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Dje3o9MhibXgOfT...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>概率论社交实验站</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/5EUcOMTqUyQKcui...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>微积分与概率论</td>\n",
       "      <td>微信号：mathtsing</td>\n",
       "      <td>http://mmbiz.qpic.cn/sz_mmbiz_png/FWjW9eJmCwvM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>余丙森概率论</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/iczoVwfI5GiaPt1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>排列五概率论与数理统计</td>\n",
       "      <td>微信号：qxctw2236350405</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/eQoM8wR3Nibx66T...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      nickname               wechat  \\\n",
       "0          概率论     微信号：ilovexiaogai   \n",
       "1     概率论社交实验站              微信号：未设置   \n",
       "2      微积分与概率论        微信号：mathtsing   \n",
       "3       余丙森概率论              微信号：未设置   \n",
       "4  排列五概率论与数理统计  微信号：qxctw2236350405   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/Dje3o9MhibXgOfT...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/5EUcOMTqUyQKcui...  \n",
       "2  http://mmbiz.qpic.cn/sz_mmbiz_png/FWjW9eJmCwvM...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/iczoVwfI5GiaPt1...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/eQoM8wR3Nibx66T...  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Dje3o9MhibXgOfTEQZnf0RQZrIwV1o0jRrLfjd8ibdKbceY0DSmRSicdshZu0JF25jg5qcqB1jZJJiajQ5zUiaJn8LQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">概率论</strong> <i class=\"inner_link_account_wechat\">微信号：ilovexiaogai</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 216]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Upper bound for page jumps: read the numeric labels of the paginator.\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int = list(map(lambda label: int(label.text), l_e))\n",
    "print(l_e_int)\n",
    "# True when the first and last label carry the same number.\n",
    "print(l_e_int[0] == l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visit every page from 1 up to the last page number shown by the paginator.\n",
    "pages = list(range(1, l_e_int[-1] + 1))\n",
    "# Print a short summary rather than the whole list.\n",
    "print(pages[:3], \"...\", pages[-3:], \"total:\", len(pages))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 循环遍历  抓取概率论公众号指定时间区间的文章"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Global variables used by the crawl cells.\n",
    "html_raw = {}\n",
    "main_content = \"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages, base_delay=20, jitter=1):\n",
    "    \"\"\"Jump to each listed page and store its article-list HTML in ``html_raw``.\n",
    "\n",
    "    For every page number this types the number into the paginator's jump box,\n",
    "    clicks the jump link, waits, and saves the ``innerHTML`` of the\n",
    "    ``inner_link_article_list`` container under ``html_raw[p]``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pages : iterable of int\n",
    "        Page numbers to visit, in order.\n",
    "    base_delay : float, default 20\n",
    "        Seconds to wait after each jump before reading the page.\n",
    "    jitter : float, default 1\n",
    "        Upper bound of the extra random wait added to ``base_delay``.\n",
    "\n",
    "    Uses the notebook globals ``driver`` and ``html_raw``; returns None.\n",
    "    \"\"\"\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        # The jump box and jump link are looked up afresh for every page.\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(str(p))\n",
    "        跳转_a.click()\n",
    "\n",
    "        # Fixed wait plus random jitter before reading the list.\n",
    "        time.sleep(base_delay + jitter * random())\n",
    "\n",
    "        # Local names on purpose: the notebook-level `element` / `main_content` stay untouched.\n",
    "        article_list = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        html_raw[p] = article_list.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t56\t57\t58\t59\t60\t61\t62\t63\t64\t65\t66\t67\t68\t69\t70\t71\t72\t73\t74\t75\t76\t77\t78\t79\t80\t81\t82\t83\t84\t85\t86\t87\t88\t89\t90\t91\t92\t93\t94\t95\t96\t97\t98\t99\t100\t101\t102\t103\t104\t105\t106\t107\t108\t109\t110\t111\t112\t113\t114\t115\t116\t117\t118\t119\t120\t121\t122\t123\t124\t125\t126\t127\t128\t129\t130\t131\t132\t133\t134\t135\t136\t137\t138\t139\t140\t141\t142\t143\t144\t145\t146\t147\t148\t149\t150\t151\t152\t153\t154\t155\t156\t157\t158\t159\t160\t161\t162\t163\t164\t165\t166\t167\t168\t169\t170\t171\t172\t173\t174\t175\t176\t177\t178\t179\t180\t181\t182\t183\t184\t185\t186\t187\t188\t189\t190\t191\t192\t193\t194\t195\t196\t197\t198\t199\t200\t201\t202\t203\t204\t205\t206\t207\t208\t209\t210\t211\t212\t213\t214\t215\t216\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "216"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Highest page number captured so far (None when nothing has been captured).\n",
    "# Read from html_raw because the loop variable `p` only exists inside process_pages.\n",
    "max(html_raw, default=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>212</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>213</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>215</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>216</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>216 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "1    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5    <div class=\"weui-desktop-radio-group\"><label c...\n",
       "..                                                 ...\n",
       "212  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "213  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "214  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "215  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "216  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "\n",
       "[216 rows x 1 columns]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle\n",
    "\n",
    "# Also pickle the raw snippets to disk; the context manager flushes and closes the file.\n",
    "with open(\"html_raw\", 'wb') as filehandler:\n",
    "    pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "59\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>212</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>213</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>215</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>216</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>157 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "12   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "61   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "62   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "63   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "64   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "..                                                 ...\n",
       "212  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "213  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "214  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "215  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "216  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "\n",
       "[157 rows x 1 columns]"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the first occurrence of each distinct snippet.\n",
    "df_out = df.drop_duplicates()\n",
    "print(len(df_out))\n",
    "# Show the rows that repeat an earlier snippet.\n",
    "df.loc[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12,\n",
       " 61,\n",
       " 62,\n",
       " 63,\n",
       " 64,\n",
       " 65,\n",
       " 66,\n",
       " 67,\n",
       " 68,\n",
       " 69,\n",
       " 70,\n",
       " 71,\n",
       " 72,\n",
       " 73,\n",
       " 74,\n",
       " 75,\n",
       " 76,\n",
       " 77,\n",
       " 78,\n",
       " 79,\n",
       " 80,\n",
       " 81,\n",
       " 82,\n",
       " 83,\n",
       " 84,\n",
       " 85,\n",
       " 86,\n",
       " 87,\n",
       " 88,\n",
       " 89,\n",
       " 90,\n",
       " 91,\n",
       " 92,\n",
       " 93,\n",
       " 94,\n",
       " 95,\n",
       " 96,\n",
       " 97,\n",
       " 98,\n",
       " 99,\n",
       " 100,\n",
       " 101,\n",
       " 102,\n",
       " 103,\n",
       " 104,\n",
       " 105,\n",
       " 106,\n",
       " 107,\n",
       " 108,\n",
       " 109,\n",
       " 110,\n",
       " 111,\n",
       " 112,\n",
       " 113,\n",
       " 114,\n",
       " 115,\n",
       " 116,\n",
       " 117,\n",
       " 118,\n",
       " 119,\n",
       " 120,\n",
       " 121,\n",
       " 122,\n",
       " 123,\n",
       " 124,\n",
       " 125,\n",
       " 126,\n",
       " 127,\n",
       " 128,\n",
       " 129,\n",
       " 130,\n",
       " 131,\n",
       " 132,\n",
       " 133,\n",
       " 134,\n",
       " 135,\n",
       " 136,\n",
       " 137,\n",
       " 138,\n",
       " 139,\n",
       " 140,\n",
       " 141,\n",
       " 142,\n",
       " 143,\n",
       " 144,\n",
       " 145,\n",
       " 146,\n",
       " 147,\n",
       " 148,\n",
       " 149,\n",
       " 150,\n",
       " 151,\n",
       " 152,\n",
       " 153,\n",
       " 154,\n",
       " 155,\n",
       " 156,\n",
       " 157,\n",
       " 158,\n",
       " 159,\n",
       " 160,\n",
       " 161,\n",
       " 162,\n",
       " 163,\n",
       " 164,\n",
       " 165,\n",
       " 166,\n",
       " 167,\n",
       " 168,\n",
       " 169,\n",
       " 170,\n",
       " 171,\n",
       " 172,\n",
       " 173,\n",
       " 174,\n",
       " 175,\n",
       " 176,\n",
       " 177,\n",
       " 178,\n",
       " 179,\n",
       " 180,\n",
       " 181,\n",
       " 182,\n",
       " 183,\n",
       " 184,\n",
       " 185,\n",
       " 186,\n",
       " 187,\n",
       " 188,\n",
       " 189,\n",
       " 190,\n",
       " 191,\n",
       " 192,\n",
       " 193,\n",
       " 194,\n",
       " 195,\n",
       " 196,\n",
       " 197,\n",
       " 198,\n",
       " 199,\n",
       " 200,\n",
       " 201,\n",
       " 202,\n",
       " 203,\n",
       " 204,\n",
       " 205,\n",
       " 206,\n",
       " 207,\n",
       " 208,\n",
       " 209,\n",
       " 210,\n",
       " 211,\n",
       " 212,\n",
       " 213,\n",
       " 214,\n",
       " 215,\n",
       " 216]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index labels of the rows whose snippet repeats an earlier row.\n",
    "try_again = df.loc[df.duplicated()].index.tolist()\n",
    "print(try_again)\n",
    "# Add the entries of `pages` that have no row in df at all.\n",
    "try_again = try_again + list(set(pages) - set(df.index.values))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 暂存档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the de-duplicated snippets to this account's tab-separated file.\n",
    "filename = fn[\"output\"][\"公众号_htm_snippets\"]\n",
    "df_out.to_csv(\n",
    "    filename.format(公众号=公众号),\n",
    "    sep=\"\\t\",\n",
    "    encoding=\"utf8\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4,5,6,5,5,6,5,5,5,6,5,5,5,5,5,5,5,6,5,6,5,5,5,5,5,6,5,5,5,6,5,5,6,5,5,5,6,5,5,5,5,5,5,5,5,5,6,6,5,5,5,7,6,5,5,5,5,5,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    \"\"\"Extract the article list from one HTML snippet.\n",
    "\n",
    "    Returns a DataFrame with the columns title, create_time and link. Titles\n",
    "    come from the second span inside each inner_link_article_title div, dates\n",
    "    from each inner_link_article_date div, and links from the href of every\n",
    "    <a> element in the snippet.\n",
    "    \"\"\"\n",
    "    tree = fromstring(_snippet_)\n",
    "    title_nodes = tree.xpath('//div[@class=\"inner_link_article_title\"]/span[2]')\n",
    "    date_nodes = tree.xpath('//div[@class=\"inner_link_article_date\"]')\n",
    "    columns = {\n",
    "        \"title\": [node.text for node in title_nodes],\n",
    "        \"create_time\": [node.text for node in date_nodes],\n",
    "        \"link\": list(tree.xpath('//a/@href')),\n",
    "    }\n",
    "    return pd.DataFrame(columns)\n",
    "    \n",
    "# Parse the snippet of every page, printing how many articles each one yields.\n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p, \"html_snippets\"])\n",
    "    l_df.append(_df_)\n",
    "    print(len(_df_), end=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>“第23次「日抛关系」后，我心动了。”</td>\n",
       "      <td>2021-05-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-05-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>“ ......敢不敢赌我喜欢你？”</td>\n",
       "      <td>2021-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>“妈，你也可以试试「卫生棉条」。”</td>\n",
       "      <td>2021-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>是否删除2020不好的回忆？\\n\\n          【是】   【否】</td>\n",
       "      <td>2021-02-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>马上就到除夕啦\\n\\n下图是浮世号列车🚞里的一个彩蛋\\n\\n欢迎长按保存图片，放置朋友圈\\n...</td>\n",
       "      <td>2021-02-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>“......你呢？喜欢现在的自己吗？”</td>\n",
       "      <td>2021-02-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-02-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>如果你还在为自己孤单寂寞怀才不遇举世皆浊我独醒而深深叹息的话，那么让我告诉你：\\n\\n你买不...</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-04-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>没人知道，他是我的「性幻想对象」。</td>\n",
       "      <td>2021-04-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                title create_time  \\\n",
       "0                                 “第23次「日抛关系」后，我心动了。”  2021-05-19   \n",
       "1                                                分享图片  2021-05-18   \n",
       "2                                  “ ......敢不敢赌我喜欢你？”  2021-05-17   \n",
       "3                                   “妈，你也可以试试「卫生棉条」。”  2021-05-16   \n",
       "4               是否删除2020不好的回忆？\\n\\n          【是】   【否】  2021-02-11   \n",
       "5   马上就到除夕啦\\n\\n下图是浮世号列车🚞里的一个彩蛋\\n\\n欢迎长按保存图片，放置朋友圈\\n...  2021-02-10   \n",
       "6                                “......你呢？喜欢现在的自己吗？”  2021-02-09   \n",
       "7                                                分享图片  2021-02-08   \n",
       "8   如果你还在为自己孤单寂寞怀才不遇举世皆浊我独醒而深深叹息的话，那么让我告诉你：\\n\\n你买不...  2021-02-07   \n",
       "9                                                分享图片  2021-04-26   \n",
       "10                                  没人知道，他是我的「性幻想对象」。  2021-04-24   \n",
       "\n",
       "                                                 link  \n",
       "0   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "2   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "3   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "4   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "5   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "6   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "7   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "8   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "9   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "10  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  "
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the per-page frames into one table with a fresh 0..n-1 index.\n",
    "df_url_out = pd.concat(l_df, ignore_index=True)\n",
    "# Preview the rows labelled 0 through 10.\n",
    "df_url_out.loc[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1089</th>\n",
       "      <td>“ 祝 我 永 不 坠 入 爱 河 。”</td>\n",
       "      <td>2020-03-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1090</th>\n",
       "      <td>“ 好 想 陪 你 看 樱 花 。 ”</td>\n",
       "      <td>2020-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1091</th>\n",
       "      <td>“没什么，就想和你好好睡一觉。”</td>\n",
       "      <td>2020-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1092</th>\n",
       "      <td>最近过得还好吗？</td>\n",
       "      <td>2020-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1093</th>\n",
       "      <td>“喵唔……我真的好想你。”</td>\n",
       "      <td>2020-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     title create_time  \\\n",
       "1089  “ 祝 我 永 不 坠 入 爱 河 。”  2020-03-24   \n",
       "1090   “ 好 想 陪 你 看 樱 花 。 ”  2020-03-23   \n",
       "1091      “没什么，就想和你好好睡一觉。”  2020-03-21   \n",
       "1092              最近过得还好吗？  2020-03-17   \n",
       "1093         “喵唔……我真的好想你。”  2020-03-14   \n",
       "\n",
       "                                                   link  \n",
       "1089  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1090  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1091  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1092  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1093  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  "
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Last five rows of the article table.\n",
    "df_url_out.iloc[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>value</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-05-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>“妈，你也可以试试「卫生棉条」。”</td>\n",
       "      <td>2021-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>是否删除2020不好的回忆？\\n\\n          【是】   【否】</td>\n",
       "      <td>2021-02-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-02-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-04-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>300</th>\n",
       "      <td>7,500,000,000 个藏在数字里的人。</td>\n",
       "      <td>2020-04-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>301</th>\n",
       "      <td>来吧，住进【星河手帐】</td>\n",
       "      <td>2020-04-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>302</th>\n",
       "      <td>我在「色聊群」潜伏的这一年。</td>\n",
       "      <td>2020-04-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>306</th>\n",
       "      <td>定了！【星河手帐】设计！</td>\n",
       "      <td>2020-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>307</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2020-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>129 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       title create_time  \\\n",
       "value                                                      \n",
       "1                                       分享图片  2021-05-18   \n",
       "3                          “妈，你也可以试试「卫生棉条」。”  2021-05-16   \n",
       "4      是否删除2020不好的回忆？\\n\\n          【是】   【否】  2021-02-11   \n",
       "7                                       分享图片  2021-02-08   \n",
       "9                                       分享图片  2021-04-26   \n",
       "...                                      ...         ...   \n",
       "300                  7,500,000,000 个藏在数字里的人。  2020-04-10   \n",
       "301                              来吧，住进【星河手帐】  2020-04-09   \n",
       "302                           我在「色聊群」潜伏的这一年。  2020-04-08   \n",
       "306                             定了！【星河手帐】设计！  2020-03-31   \n",
       "307                                     分享图片  2020-03-29   \n",
       "\n",
       "                                                    link  \n",
       "value                                                     \n",
       "1      http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "3      http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "4      http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "7      http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "9      http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "...                                                  ...  \n",
       "300    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "301    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "302    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "306    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "307    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "\n",
       "[129 rows x 3 columns]"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tagging 标记\n",
    "tagging_list = [\"\",\"喜欢\", \"关系\", \"在意\",\"爱\",\"男朋友\",\"分手\",\"恋爱\",\"性\",\\\n",
    "                \"愿意\",\"在一起\",\\\n",
    "                \"妈妈\",\"人生\",\"日子\",\"父母\",\"爸妈\",\"亲情\",\\\n",
    "                \"女孩\",\"焦虑\",\"累\",\"难过\",\"哭泣\",\"自己\",\\\n",
    "                \"孤独\", \"释放\",\"敷衍\",\"心动\",\"熬\", \"期待\",\"麻烦\",\"认真\",\\\n",
    "                \"女友\", \"忠诚\",\\\n",
    "                \"信赖\",\\\n",
    "                \"相遇\",\\\n",
    "                \"出柜\",\"gay\",\"百合\",\"同性\",\"不婚\",\\\n",
    "                \"对不起\",\"抱歉\",\"谢谢\",\\\n",
    "                \"自由\", \"遗憾\", \"崩溃\", \"忘\", \"开心\",\\\n",
    "                \"属于\",\"羞耻\",\"再见\",\"厌倦\", \"丧\",\\\n",
    "                \"下意识\", \"忽略\",\\\n",
    "                \"结婚\",\"告白\", \\\n",
    "                \"冷淡\", \"逃跑\", \"需要\", \"永远\",\"循环\",\"有意思\",\\\n",
    "                \"有趣\", \"标价\",\"心上\",\"快乐\",\"相爱\",\"终结\",\\\n",
    "                \"睡\",\"最近\",\"聊天\",\"声音\",\"加油\",\\\n",
    "                \"偷偷\", \"抱抱\", \"甜\", \"梦想\", \"离开\", \"未来\", \"糟糕\",\\\n",
    "                \"单身\",\"出售\",\"幸福\",\"开口\",\"心情\",\"心碎\", \"想\", \"想念\", \"美好\",\"瞬间\",\"备胎\",\\\n",
    "                ] #overwritable\n",
    "\n",
    "v_v_list = []\n",
    "\n",
    "for tag in tagging_list:\n",
    "    index_list = df_url_out [ df_url_out.title.str.contains(tag) ].index.to_list()\n",
    "    v_v_pairs = pd.DataFrame({tag:index_list}).melt().set_index(\"value\")\n",
    "    v_v_list.append(v_v_pairs)\n",
    "\n",
    "df_cat = v_v_list[0]\n",
    "for d in v_v_list:\n",
    "    df_cat.update(d)\n",
    "    \n",
    "# 尚未标记内容\n",
    "df_url_out.loc [ df_cat.query('variable==\"\"').index ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ==&mid=2247518455&idx=1&sn=82d4020c898ddcd8cdab92385eb0d424&chksm=e9edf44bde9a7d5d0fe11c3bcd4e6014215b44d41a88e7346394e23aab64cfc43345666d66ff#rd'"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Full link of the article at row label 53 (the table previews truncate it).\n",
    "df_url_out.loc[53, \"link\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>是否删除2020不好的回忆？\\n\\n          【是】   【否】</td>\n",
       "      <td>2021-02-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>马上就到除夕啦\\n\\n下图是浮世号列车🚞里的一个彩蛋\\n\\n欢迎长按保存图片，放置朋友圈\\n...</td>\n",
       "      <td>2021-02-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>“......你呢？喜欢现在的自己吗？”</td>\n",
       "      <td>2021-02-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-02-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>如果你还在为自己孤单寂寞怀才不遇举世皆浊我独醒而深深叹息的话，那么让我告诉你：\\n\\n你买不...</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1089</th>\n",
       "      <td>“ 祝 我 永 不 坠 入 爱 河 。”</td>\n",
       "      <td>2020-03-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1090</th>\n",
       "      <td>“ 好 想 陪 你 看 樱 花 。 ”</td>\n",
       "      <td>2020-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1091</th>\n",
       "      <td>“没什么，就想和你好好睡一觉。”</td>\n",
       "      <td>2020-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1092</th>\n",
       "      <td>最近过得还好吗？</td>\n",
       "      <td>2020-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1093</th>\n",
       "      <td>“喵唔……我真的好想你。”</td>\n",
       "      <td>2020-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>785 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  title create_time  \\\n",
       "57                是否删除2020不好的回忆？\\n\\n          【是】   【否】  2021-02-11   \n",
       "58    马上就到除夕啦\\n\\n下图是浮世号列车🚞里的一个彩蛋\\n\\n欢迎长按保存图片，放置朋友圈\\n...  2021-02-10   \n",
       "59                                 “......你呢？喜欢现在的自己吗？”  2021-02-09   \n",
       "60                                                 分享图片  2021-02-08   \n",
       "61    如果你还在为自己孤单寂寞怀才不遇举世皆浊我独醒而深深叹息的话，那么让我告诉你：\\n\\n你买不...  2021-02-07   \n",
       "...                                                 ...         ...   \n",
       "1089                               “ 祝 我 永 不 坠 入 爱 河 。”  2020-03-24   \n",
       "1090                                “ 好 想 陪 你 看 樱 花 。 ”  2020-03-23   \n",
       "1091                                   “没什么，就想和你好好睡一觉。”  2020-03-21   \n",
       "1092                                           最近过得还好吗？  2020-03-17   \n",
       "1093                                      “喵唔……我真的好想你。”  2020-03-14   \n",
       "\n",
       "                                                   link  \n",
       "57    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "58    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "59    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "60    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "61    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "...                                                 ...  \n",
       "1089  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1090  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1091  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1092  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1093  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "\n",
       "[785 rows x 3 columns]"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows that repeat an earlier article row in every column.\n",
    "df_url_out.loc[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>“第23次「日抛关系」后，我心动了。”</td>\n",
       "      <td>2021-05-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-05-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>“ ......敢不敢赌我喜欢你？”</td>\n",
       "      <td>2021-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>“妈，你也可以试试「卫生棉条」。”</td>\n",
       "      <td>2021-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>是否删除2020不好的回忆？\\n\\n          【是】   【否】</td>\n",
       "      <td>2021-02-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>309</th>\n",
       "      <td>“ 祝 我 永 不 坠 入 爱 河 。”</td>\n",
       "      <td>2020-03-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>310</th>\n",
       "      <td>“ 好 想 陪 你 看 樱 花 。 ”</td>\n",
       "      <td>2020-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>311</th>\n",
       "      <td>“没什么，就想和你好好睡一觉。”</td>\n",
       "      <td>2020-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>312</th>\n",
       "      <td>最近过得还好吗？</td>\n",
       "      <td>2020-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>313</th>\n",
       "      <td>“喵唔……我真的好想你。”</td>\n",
       "      <td>2020-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>309 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     title create_time  \\\n",
       "0                      “第23次「日抛关系」后，我心动了。”  2021-05-19   \n",
       "1                                     分享图片  2021-05-18   \n",
       "2                       “ ......敢不敢赌我喜欢你？”  2021-05-17   \n",
       "3                        “妈，你也可以试试「卫生棉条」。”  2021-05-16   \n",
       "4    是否删除2020不好的回忆？\\n\\n          【是】   【否】  2021-02-11   \n",
       "..                                     ...         ...   \n",
       "309                   “ 祝 我 永 不 坠 入 爱 河 。”  2020-03-24   \n",
       "310                    “ 好 想 陪 你 看 樱 花 。 ”  2020-03-23   \n",
       "311                       “没什么，就想和你好好睡一觉。”  2020-03-21   \n",
       "312                               最近过得还好吗？  2020-03-17   \n",
       "313                          “喵唔……我真的好想你。”  2020-03-14   \n",
       "\n",
       "                                                  link  \n",
       "0    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "1    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "2    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "3    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "4    http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "..                                                 ...  \n",
       "309  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "310  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "311  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "312  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "313  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...  \n",
       "\n",
       "[309 rows x 3 columns]"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct rows only: the first occurrence of every row, original index labels kept.\n",
    "# Display only -- df_url_out itself is left unchanged.\n",
    "df_url_out.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>“第23次「日抛关系」后，我心动了。”</td>\n",
       "      <td>2021-05-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>心动</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>分享图片</td>\n",
       "      <td>2021-05-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>“ ......敢不敢赌我喜欢你？”</td>\n",
       "      <td>2021-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>喜欢</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>“妈，你也可以试试「卫生棉条」。”</td>\n",
       "      <td>2021-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>是否删除2020不好的回忆？\\n\\n          【是】   【否】</td>\n",
       "      <td>2021-02-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1089</th>\n",
       "      <td>“ 祝 我 永 不 坠 入 爱 河 。”</td>\n",
       "      <td>2020-03-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>爱</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1090</th>\n",
       "      <td>“ 好 想 陪 你 看 樱 花 。 ”</td>\n",
       "      <td>2020-03-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>想</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1091</th>\n",
       "      <td>“没什么，就想和你好好睡一觉。”</td>\n",
       "      <td>2020-03-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>想</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1092</th>\n",
       "      <td>最近过得还好吗？</td>\n",
       "      <td>2020-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>最近</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1093</th>\n",
       "      <td>“喵唔……我真的好想你。”</td>\n",
       "      <td>2020-03-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>想</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1094 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      title create_time  \\\n",
       "0                       “第23次「日抛关系」后，我心动了。”  2021-05-19   \n",
       "1                                      分享图片  2021-05-18   \n",
       "2                        “ ......敢不敢赌我喜欢你？”  2021-05-17   \n",
       "3                         “妈，你也可以试试「卫生棉条」。”  2021-05-16   \n",
       "4     是否删除2020不好的回忆？\\n\\n          【是】   【否】  2021-02-11   \n",
       "...                                     ...         ...   \n",
       "1089                   “ 祝 我 永 不 坠 入 爱 河 。”  2020-03-24   \n",
       "1090                    “ 好 想 陪 你 看 樱 花 。 ”  2020-03-23   \n",
       "1091                       “没什么，就想和你好好睡一觉。”  2020-03-21   \n",
       "1092                               最近过得还好吗？  2020-03-17   \n",
       "1093                          “喵唔……我真的好想你。”  2020-03-14   \n",
       "\n",
       "                                                   link variable  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       心动  \n",
       "1     http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...     无法分类  \n",
       "2     http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       喜欢  \n",
       "3     http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...     无法分类  \n",
       "4     http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...     无法分类  \n",
       "...                                                 ...      ...  \n",
       "1089  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...        爱  \n",
       "1090  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...        想  \n",
       "1091  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...        想  \n",
       "1092  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       最近  \n",
       "1093  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...        想  \n",
       "\n",
       "[1094 rows x 4 columns]"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The scraped URL table can contain exact repeats of the same article row. Drop them\n",
    "# first so that every article is categorised, counted and exported only once;\n",
    "# otherwise the per-category counts computed from df_o are inflated by the repeats.\n",
    "df_o = (\n",
    "    df_url_out.drop_duplicates()  # keeps the first occurrence and its index label\n",
    "    .join(df_cat)  # left join on the index, so surviving rows keep their own category\n",
    "    .replace(\"\", np.nan)  # treat empty strings like missing values\n",
    "    .fillna(\"无法分类\")  # placeholder label for anything left without a value\n",
    ")\n",
    "df_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>翻8个女孩的购物车，我发现了「穿衣焦虑」。</td>\n",
       "      <td>2021-04-12</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>焦虑</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>“女孩子太主动，样子不好看的。”</td>\n",
       "      <td>2020-12-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>女孩</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>181</th>\n",
       "      <td>最近一段时间，和几个处于人生瓶颈期的朋友聊天。\\n\\n他们觉得现在的生活无趣，都想做一些转型...</td>\n",
       "      <td>2020-09-12</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>想</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>235</th>\n",
       "      <td>那些被下意识忽略的女孩子。</td>\n",
       "      <td>2020-06-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>忽略</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>266</th>\n",
       "      <td>“今晚，我要找101个眼里有光的女孩。”</td>\n",
       "      <td>2020-05-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...</td>\n",
       "      <td>女孩</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 title create_time  \\\n",
       "18                               翻8个女孩的购物车，我发现了「穿衣焦虑」。  2021-04-12   \n",
       "104                                   “女孩子太主动，样子不好看的。”  2020-12-17   \n",
       "181  最近一段时间，和几个处于人生瓶颈期的朋友聊天。\\n\\n他们觉得现在的生活无趣，都想做一些转型...  2020-09-12   \n",
       "235                                      那些被下意识忽略的女孩子。  2020-06-29   \n",
       "266                               “今晚，我要找101个眼里有光的女孩。”  2020-05-18   \n",
       "\n",
       "                                                  link variable  \n",
       "18   http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       焦虑  \n",
       "104  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       女孩  \n",
       "181  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...        想  \n",
       "235  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       忽略  \n",
       "266  http://mp.weixin.qq.com/s?__biz=MzI1MTY4NjM1NQ...       女孩  "
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check: list the rows whose title contains the keyword.\n",
    "df_o.loc[df_o[\"title\"].str.contains(\"女孩\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variable</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>想</th>\n",
       "      <td>528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>爱</th>\n",
       "      <td>166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最近</th>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>无法分类</th>\n",
       "      <td>129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>恋爱</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>自己</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>性</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>抱歉</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>永远</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>喜欢</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>聊天</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>心动</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>忘</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>瞬间</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>累</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>对不起</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>开心</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>父母</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>结婚</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>美好</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>羞耻</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>遗憾</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>丧</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>麻烦</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>孤独</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>分手</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>出柜</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>在一起</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>女孩</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>关系</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>再见</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>出售</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>偷偷</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>释放</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>难过</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>同性</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>告白</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>睡</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>相遇</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>相爱</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>爸妈</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>哭泣</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>心情</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>忠诚</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>焦虑</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>在意</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>标价</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>未来</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>期待</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>有意思</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>女友</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>日子</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>愿意</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>信赖</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>属于</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>崩溃</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>忽略</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>快乐</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>熬</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          title\n",
       "variable       \n",
       "想           528\n",
       "爱           166\n",
       "最近          157\n",
       "无法分类        129\n",
       "恋爱           10\n",
       "自己            7\n",
       "性             6\n",
       "抱歉            5\n",
       "永远            4\n",
       "喜欢            4\n",
       "聊天            4\n",
       "心动            4\n",
       "忘             3\n",
       "瞬间            3\n",
       "累             3\n",
       "对不起           3\n",
       "开心            3\n",
       "父母            2\n",
       "结婚            2\n",
       "美好            2\n",
       "羞耻            2\n",
       "遗憾            2\n",
       "丧             2\n",
       "麻烦            2\n",
       "孤独            2\n",
       "分手            2\n",
       "出柜            2\n",
       "在一起           2\n",
       "女孩            2\n",
       "关系            2\n",
       "再见            1\n",
       "出售            1\n",
       "偷偷            1\n",
       "释放            1\n",
       "难过            1\n",
       "同性            1\n",
       "告白            1\n",
       "睡             1\n",
       "相遇            1\n",
       "相爱            1\n",
       "爸妈            1\n",
       "哭泣            1\n",
       "心情            1\n",
       "忠诚            1\n",
       "焦虑            1\n",
       "在意            1\n",
       "标价            1\n",
       "未来            1\n",
       "期待            1\n",
       "有意思           1\n",
       "女友            1\n",
       "日子            1\n",
       "愿意            1\n",
       "信赖            1\n",
       "属于            1\n",
       "崩溃            1\n",
       "忽略            1\n",
       "快乐            1\n",
       "熬             1"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of titles per category label, most frequent label first.\n",
    "df_stats = (\n",
    "    df_o.groupby(\"variable\")\n",
    "    .agg({\"title\": \"count\"})\n",
    "    .sort_values(\"title\", ascending=False)\n",
    ")\n",
    "df_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label each frame's column index; the export cell reuses this label as the worksheet name.\n",
    "(\n",
    "    df_account.columns.name,\n",
    "    df_o.columns.name,\n",
    "    df_stats.columns.name,\n",
    ") = (\"rel_accounts\", \"url_cat\", \"stats\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'stats'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the worksheet label directly on df_stats. The loop variable `_df_` is only\n",
    "# created by the export cell further down, so referring to it here raises NameError\n",
    "# when the notebook is run top to bottom on a fresh kernel.\n",
    "df_stats.columns.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the xlsxwriter workbook and worksheet objects.  \n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=公众号)) as writer:\n",
    "    workbook  = writer.book\n",
    "\n",
    "    for _df_ in [df_account, df_o, df_stats]:\n",
    "        _df_.to_excel(writer, sheet_name = _df_.columns.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 遇到问题"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 问题：点击“图文信息”的代码经核对无误，但运行后页面没有反应，并提示 `NoSuchElementException: Unable to locate element`。\n",
    "- 解决方法：参考了网上帖子：[解决网页元素无法定位](https://blog.csdn.net/mrlevo520/article/details/51954203)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
